library(data.table)
library(dplyr)
library(tidyr)
library(stringr)
library(haven)


###################################################
#               CLUSTERS 1 TO 5                   #
###################################################
#
# Every cluster goes through the same pipeline:
#   1. read all .txt files below "<raw_dir>/sets/cluster <n>" line by line,
#   2. cut the stream of lines into articles,
#   3. pull outlet, date, page, headline and body text out of each article,
#   4. write the result to "<raw_dir>/cluster<n>.dta".
# All intermediate objects live inside the functions below, so nothing has to
# be cleared from the workspace between clusters.

# Folder holding the raw files ("sets/cluster <n>") and receiving the output.
# The path is relative to the directory the script is started from; the
# working directory is never changed, so every path below stays valid.
raw_dir <- file.path(".", "raw articles")

# Line that marks the start of a new article.
article_separator <- "=============================================================================="

# Read every .txt file found (recursively) below `cluster_dir`.
#
# cluster_dir: path of the folder to scan.
# Returns a character vector with one element per line, files concatenated in
# the order list.files() reports them.
# Stops with an error when the folder contains no .txt file.
read_cluster_lines <- function(cluster_dir) {
  # "\\.txt$" matches the extension only; full.names = TRUE makes the paths
  # usable without changing the working directory.
  files <- list.files(
    path = cluster_dir,
    pattern = "\\.txt$",
    recursive = TRUE,
    full.names = TRUE
  )
  if (length(files) == 0L) {
    stop("no .txt files found in '", cluster_dir, "'", call. = FALSE)
  }
  tables <- lapply(files, fread, sep = NULL, header = FALSE)
  rbindlist(tables)[[1L]]
}

# Turn the raw lines of one cluster into one row per article.
#
# text: character vector of lines as returned by read_cluster_lines().
# Returns a data.frame with the columns id (running article number), outlet,
# date (class Date), page (numeric), headline and text, sorted by id. Only
# articles for which all of these pieces were found are returned, because the
# pieces are combined with inner joins.
parse_articles <- function(text) {
  # There is no article separator above the first article of a file, but the
  # "Artikler:" / "News Articles:" line sits in that position, so it is turned
  # into a separator.
  text <- sub("^(Artikler|News Articles):$", article_separator, text, perl = TRUE)

  # Integer ids: paste0() would render a double such as 100000 as "1e+05".
  lines <- data.frame(
    text = text,
    id = cumsum(text == article_separator),
    stringsAsFactors = FALSE
  )

  # Drop the table of contents included in some files (lines starting with
  # "> ") and blank lines. Ids were assigned before, so they are unaffected.
  is_toc <- str_starts(lines$text, "\\>\\s")
  lines <- lines[which(!is_toc & lines$text != ""), , drop = FALSE]

  meta <- lines %>%
    group_by(id) %>%
    mutate(
      line_no = row_number(),
      # 1 on the "<outlet>, <date>" line: it ends in ", dd.mm.yyyy" or
      # ", yyyy-mm-dd" and has to be among the first six lines of the article.
      date_loc = as.numeric(
        (grepl("\\,\\s\\d{2}\\.\\d{2}\\.\\d{4}$", text) |
          grepl("\\,\\s\\d{4}\\-\\d{2}\\-\\d{2}$", text)) & line_no < 7
      ),
      # 0 above the date line, 1 from the date line onwards ...
      after_date = cumsum(date_loc),
      # ... so the running sum is 1 on the date line, 2 on the line right
      # below it and larger afterwards (given a single date line per article).
      after_date2 = cumsum(after_date),
      # 0 until the first line containing a copyright sign followed by
      # whitespace, positive from that line onwards.
      after_copy = cumsum(as.numeric(str_detect(text, "\\u00a9\\s"))),
      pub = str_detect(text, "Udgivet på print") |
        str_detect(text, "Publicerat i print")
    ) %>%
    ungroup() %>%
    as.data.frame()

  # Outlet and date. The date is whatever follows the LAST comma of the date
  # line, which is where the patterns above anchor it; splitting there keeps
  # outlet names that contain a comma intact.
  date_lines <- meta[which(meta$date_loc == 1), c("id", "text")]
  raw_date <- sub("^.*,\\s", "", date_lines$text, perl = TRUE)
  dotted <- grepl("^\\d{2}\\.\\d{2}\\.\\d{4}$", raw_date, perl = TRUE)
  parsed <- as.Date(rep(NA_character_, length(raw_date)))
  parsed[dotted] <- as.Date(raw_date[dotted], format = "%d.%m.%Y")
  parsed[!dotted] <- as.Date(raw_date[!dotted], format = "%Y-%m-%d")
  dates <- data.frame(
    id = date_lines$id,
    outlet = sub(",\\s[^,]*$", "", date_lines$text, perl = TRUE),
    date = parsed,
    stringsAsFactors = FALSE
  )

  # Page: taken from the line directly below the date line. "Side " / "Sida "
  # and everything from the last comma on are stripped; whatever is left is
  # converted to a number (NA, with a warning, if it is not numeric).
  page_lines <- meta[which(meta$after_date2 == 2), c("id", "text")]
  page_txt <- gsub("Side\\s", "", page_lines$text, perl = TRUE)
  page_txt <- gsub("Sida\\s", "", page_txt, perl = TRUE)
  page <- data.frame(
    id = page_lines$id,
    page = as.numeric(gsub("\\,[^,]*$", "", page_txt, perl = TRUE))
  )

  # Headline: all lines above the date line except the separator itself,
  # joined with single spaces.
  head_lines <- meta[
    which(meta$after_date == 0 & meta$text != article_separator),
    c("id", "text")
  ]
  headline <- aggregate(text ~ id, data = head_lines, FUN = paste0, collapse = " ")
  colnames(headline) <- c("id", "headline")

  # Body: lines below the page line, up to (not including) the first
  # copyright line, without the "published in print" lines.
  body_lines <- meta[
    which(meta$after_date2 > 2 & meta$after_copy == 0 & !meta$pub),
    c("id", "text")
  ]
  body <- aggregate(text ~ id, data = body_lines, FUN = paste0, collapse = " ")

  #### combine everything
  Reduce(
    function(left, right) merge(left, right, by = "id"),
    list(dates, page, headline, body)
  )
}

# Process one cluster from raw text files to a Stata file.
#
# cluster_no: number of the cluster; selects the input folder
#             "<raw_dir>/sets/cluster <cluster_no>", the id prefix
#             "<cluster_no>c" and the output file name.
# Writes "<raw_dir>/cluster<cluster_no>.dta" and returns its path invisibly.
process_cluster <- function(cluster_no) {
  cluster_dir <- file.path(raw_dir, "sets", paste("cluster", cluster_no))
  articles <- parse_articles(read_cluster_lines(cluster_dir))
  articles$id <- paste0(cluster_no, "c", articles$id)

  out_file <- file.path(raw_dir, paste0("cluster", cluster_no, ".dta"))
  write_dta(articles, out_file)
  invisible(out_file)
}

for (cluster_no in 1:5) {
  process_cluster(cluster_no)
}

